import ast
import json

import numpy as np
import pandas as pd

# Load the raw export.
data = pd.read_csv('data.csv')

# Columns retained for the cleaned output; everything else is dropped.
col_keep = [
    'city', 'fields.comment', 'fields.discountPrice', 'fields.features',
    'fields.itemId', 'fields.itemTotalScore', 'fields.latitude',
    'fields.longitude', 'fields.price', 'fields.shortInfo', 'fields.sold365',
    'fields.soldRecentNum', 'fields.tagList', 'fields.title',
    'trip_main_busness_type',
]

# Select the columns and drop rows that repeat an item id (the first
# occurrence is kept, which is the drop_duplicates default). The explicit
# copy() gives an independent frame, so the column assignments made later in
# this script do not trigger pandas' SettingWithCopyWarning.
data = data[col_keep].drop_duplicates(subset='fields.itemId').copy()

# Cast the comment column to float. Missing cells are already NaN after
# read_csv, so they need no separate fill before the cast.
data['fields.comment'] = data['fields.comment'].astype(float)

# Process the features field
def get_fea_text(data):
    """Return the ``text`` of the first entry in a serialized feature list.

    Args:
        data: One cell of the features column. Expected to be the string
            form of a list of dicts written with Python-style quoting,
            e.g. ``"[{'text': '...'}]"``. Missing cells (``None``, NaN or
            any other non-string) and blank strings are treated as "no
            features".

    Returns:
        The ``'text'`` value of the first list entry, or ``None`` when the
        cell is missing/blank or the parsed list is empty.

    Raises:
        json.JSONDecodeError: If the string can be parsed neither as a
            Python literal nor through the legacy JSON conversion.
        KeyError: If the first entry has no ``'text'`` key.
    """
    # Missing CSV cells arrive as NaN (a float), which has no string methods.
    if not isinstance(data, str) or not data.strip():
        return None

    try:
        # literal_eval understands mixed quoting, apostrophes inside values,
        # and None/True/False, none of which survive naive quote swapping.
        features = ast.literal_eval(data)
    except (ValueError, SyntaxError):
        # Legacy path: swap quotes and map None -> null so that input using
        # JSON keywords inside single-quoted syntax keeps parsing as before.
        legacy = data.replace('"', '\\"')
        legacy = legacy.replace("'", '"')
        legacy = legacy.replace("None", "null")
        features = json.loads(legacy)

    if features:
        return features[0]['text']
    return None

# Replace each serialized feature list with the value get_fea_text extracts.
data['fields.features'] = data['fields.features'].map(get_fea_text)

# Process the sold365 field
def get_sold365(data):
    """Convert a sold-count label into a plain number.

    The number is read from the text that follows the first "售" (or from
    the start of the string when "售" is absent). If a "万" follows, the
    number before it is multiplied by 10000; otherwise the number runs up
    to the "笔" unit, or to the end of the string when no unit is present.

    Args:
        data: One cell of the sold365 column: a label such as
            ``'已售1.5万笔'`` or ``'已售500笔'``, a plain number, or a
            missing value (``None`` / NaN).

    Returns:
        The count as a float, or ``np.nan`` for missing values.

    Raises:
        ValueError: If the extracted text is not a valid number.
    """
    # str(...) == 'nan' covers both float NaN cells and the literal "nan".
    if data is None or str(data) == 'nan':
        return np.nan

    # A column that pandas already parsed as numeric needs no text handling.
    if isinstance(data, (int, float, np.number)):
        return float(data)

    # find() returns -1 when "售" is absent, so the slice starts at 0.
    text = data[data.find('售') + 1:]

    number, wan, _ = text.partition('万')
    if wan:
        return float(number) * 10000

    # partition() yields the whole text when "笔" is missing, unlike slicing
    # with find() == -1, which would silently drop the last character.
    return float(text.partition('笔')[0])

# Turn every sold365 label into a number via get_sold365.
data['fields.sold365'] = data['fields.sold365'].map(get_sold365)

# Write the cleaned table to disk without the index column.
data.to_csv('data_clean.csv', index=False)